Wykorzystane biblioteki
library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
library(knitr)
library(corrplot)
library(caret)
library(data.table)
Wczytywanie danych
# Columns that fread() is told to skip while reading the CSV.
removable_columns <- c(
  "title", "pdb_code", "res_id", "chain_id",
  "local_res_atom_count",
  "local_res_atom_non_h_occupancy_sum",
  "local_res_atom_non_h_electron_occupancy_sum",
  "local_res_atom_C_count", "local_res_atom_N_count",
  "local_res_atom_O_count", "local_res_atom_S_count",
  "dict_atom_C_count", "dict_atom_N_count",
  "dict_atom_O_count", "dict_atom_S_count",
  "skeleton_data", "skeleton_cycle_4", "skeleton_diameter",
  "skeleton_cycle_6", "skeleton_cycle_7",
  "skeleton_closeness_006_008", "skeleton_closeness_002_004",
  "skeleton_cycle_3", "skeleton_avg_degree",
  "skeleton_closeness_004_006", "skeleton_closeness_010_012",
  "skeleton_closeness_012_014",
  "skeleton_edges", "skeleton_radius", "skeleton_cycle_8_plus",
  "skeleton_closeness_020_030", "skeleton_deg_5_plus",
  "skeleton_closeness_016_018", "skeleton_closeness_008_010",
  "skeleton_closeness_018_020", "skeleton_average_clustering",
  "skeleton_closeness_040_050", "skeleton_closeness_014_016",
  "skeleton_center", "skeleton_closeness_000_002",
  "skeleton_density", "skeleton_closeness_030_040",
  "skeleton_deg_4", "skeleton_deg_0", "skeleton_deg_1",
  "skeleton_deg_2", "skeleton_deg_3",
  "skeleton_graph_clique_number", "skeleton_nodes",
  "skeleton_cycles", "skeleton_cycle_5",
  "skeleton_closeness_050_plus", "skeleton_periphery",
  "fo_col", "fc_col", "weight_col",
  "grid_space", "solvent_radius", "solvent_opening_radius",
  "part_step_FoFc_std_min", "part_step_FoFc_std_max",
  "part_step_FoFc_std_step"
)

# Load at most the first 10000 data rows; the first line of the file is the
# header, and the columns listed above are never materialised.
data <- fread(
  "./all_summary.csv",
  nrows = 10000,
  header = TRUE,
  drop = removable_columns
)
dim(data)
## [1] 10000 350
Przetwarzanie brakujących danych
# Size of the table before incomplete rows are removed.
dim(data)
## [1] 10000 350
# drop_na() without a column selection discards every row that has a missing
# value in any column.
data <- drop_na(data)
dim(data)
## [1] 8958 350
Usuwanie niepotrzebnych ligandów
# res_name values whose rows are removed from the analysis.
# NOTE(review): "H20" is spelled with the digit zero; it may have been meant
# as "H2O" - confirm against the data before changing it.
deletable_res_name <- c(
  "UNK", "UNX", "UNL", "DUM", "N", "BLOB",
  "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
  "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP",
  "TYR", "VAL",
  "DA", "DG", "DT", "DC", "DU",
  "A", "G", "T", "C", "U",
  "HOH", "H20", "WAT"
)

# Keep only the rows whose res_name is not on the list above.
data <- filter(data, !(res_name %in% deletable_res_name))
dim(data)
## [1] 8910 350
Podsumowanie danych
# Pick the ligand name and the two blob volume coverage columns, then render
# their summary() as a table.
statistics <- select(
  data,
  res_name,
  blob_volume_coverage,
  blob_volume_coverage_second
)
statistics %>%
  summary() %>%
  kable()
|   | res_name | blob_volume_coverage | blob_volume_coverage_second |
|---|---|---|---|
|   | Length:8910 | Min. :0.02305 | Min. :0.00000 |
|   | Class :character | 1st Qu.:0.50648 | 1st Qu.:0.00000 |
|   | Mode :character | Median :0.72244 | Median :0.00000 |
|   | NA | Mean :0.66784 | Mean :0.02067 |
|   | NA | 3rd Qu.:0.86480 | 3rd Qu.:0.00000 |
|   | NA | Max. :1.00000 | Max. :0.95385 |
# Print the dimensions again; the summary step above only read from `data`.
dim(data)
## [1] 8910 350
50 najpopularniejszych ligandów
# Count rows per res_name, most frequent first, and keep the first 50 entries.
popular_ligands <- data %>%
  count(res_name, sort = TRUE) %>%
  slice(seq_len(50))

# Names of those ligands as a plain vector.
popular_names_vector <- popular_ligands[["res_name"]]

# Restrict the data set to rows that belong to the selected ligands.
data <- data %>%
  filter(res_name %in% popular_names_vector)
dim(data)
## [1] 6239 350
Liczność najpopularniejszych ligandów według nazwy
# Bar chart of ligand counts, bars ordered by decreasing count and coloured
# by the count itself; rendered as an interactive plotly widget.
plot_ligands <- ggplot(
  popular_ligands,
  aes(x = reorder(res_name, -n), y = n, fill = n)
) +
  geom_bar(stat = "identity") +
  # Rotate the ligand names so they are drawn vertically on the x axis.
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Liczność ligandów według nazwy",
    x = "ligand",
    y = "liczność"
  )
ggplotly(plot_ligands)
Korelacja między zmiennymi
# data %>%
# select_if(is.numeric) %>%
# cor %>%
# corrplot(method = "circle", tl.col = "black", tl.srt = 45)
Rozkłady gęstościowe liczb
Atomów
# Density estimate of local_res_atom_non_h_count, drawn as a filled area
# without an outline and shown as an interactive plotly widget.
plot_atom <- ggplot(
  data,
  aes(x = local_res_atom_non_h_count)
) +
  geom_density(fill = "#00CECB", alpha = 0.3, colour = NA) +
  labs(
    title = "Rozkład gęstościowy atomów",
    x = "liczność atomów",
    y = "gęstość"
  )
ggplotly(plot_atom)
Elektronów
# Density estimate of local_res_atom_non_h_electron_sum, drawn as a filled
# area without an outline and shown as an interactive plotly widget.
plot_electron <- ggplot(
  data,
  aes(x = local_res_atom_non_h_electron_sum)
) +
  geom_density(fill = "#FF5E5B", alpha = 0.3, colour = NA) +
  labs(
    title = "Rozkład gęstościowy elektronów",
    x = "liczność elektronów",
    y = "gęstość"
  )
ggplotly(plot_electron)
Rozkład wartości kolumn part_01
# plot_part_data <- data %>%
# select(contains("part_01")) %>%
# gather(part, value, 1:106)
#
# dim(plot_part_data)
#
# plot_part_data_1 <- plot_part_data[1:118926,]
# plot_part_data_2 <- plot_part_data[118927:237852,]
# plot_part_data_3 <- plot_part_data[237853:356778,]
# plot_part_data_4 <- plot_part_data[356779:475704,]
# plot_part_data_5 <- plot_part_data[475705:594630,]
# plot_part_data_6 <- plot_part_data[594631:700342,]
#
# plot_ly(plot_part_data_1, x = plot_part_data_1$part, y = plot_part_data_1$value, type = 'box')
# plot_ly(plot_part_data_2, x = plot_part_data_2$part, y = plot_part_data_2$value, type = 'box')
# plot_ly(plot_part_data_3, x = plot_part_data_3$part, y = plot_part_data_3$value, type = 'box')
# plot_ly(plot_part_data_4, x = plot_part_data_4$part, y = plot_part_data_4$value, type = 'box')
# plot_ly(plot_part_data_5, x = plot_part_data_5$part, y = plot_part_data_5$value, type = 'box')
# plot_ly(plot_part_data_6, x = plot_part_data_6$part, y = plot_part_data_6$value, type = 'box')
Największe niezgodności liczby
Atomów
# Per ligand: mean absolute difference between the local and the dictionary
# non-hydrogen atom counts; the ten largest values are printed as a table.
data %>%
  group_by(res_name) %>%
  summarise(
    atom_inconsistency = mean(
      abs(local_res_atom_non_h_count - dict_atom_non_h_count)
    )
  ) %>%
  arrange(desc(atom_inconsistency)) %>%
  slice(seq_len(10)) %>%
  kable()
| res_name | atom_inconsistency |
|---|---|
| PLC | 17.1481481 |
| LHG | 4.4615385 |
| C8E | 2.6428571 |
| NDP | 1.7333333 |
| NAP | 1.5090909 |
| PG4 | 1.4225352 |
| MLY | 1.2222222 |
| CME | 1.0000000 |
| MAN | 1.0000000 |
| NAG | 0.9949495 |
Elektronów
# Per ligand: mean absolute difference between the local and the dictionary
# non-hydrogen electron sums; the ten largest values are printed as a table.
data %>%
  group_by(res_name) %>%
  summarise(
    electron_inconsistency = mean(
      abs(local_res_atom_non_h_electron_sum - dict_atom_non_h_electron_sum)
    )
  ) %>%
  arrange(desc(electron_inconsistency)) %>%
  slice(seq_len(10)) %>%
  kable()
| res_name | electron_inconsistency |
|---|---|
| PLC | 114.444444 |
| LHG | 34.096154 |
| C8E | 16.714286 |
| NDP | 11.333333 |
| NAP | 10.654545 |
| PG4 | 9.633803 |
| MLY | 9.370370 |
| CME | 8.000000 |
| MAN | 8.000000 |
| NAG | 7.959596 |
Regresja liniowa
Liczba atomów
# data_partition <- data %>%
# select_if(is.numeric)
#
# set.seed(111)
# partition <- createDataPartition(
# y = data_partition$local_res_atom_non_h_count,
# p = .7,
# list = FALSE)
#
# data_train <- data_partition %>%
# slice(partition)
# data_test <- data_partition %>%
# slice(-partition)
# dim(data_train)
# dim(data_test)
#
# set.seed(111)
# fit <- train(local_res_atom_non_h_count ~ ., data = data_train, method = "lm")
# fit
#
# set.seed(111)
# prediction <- predict(fit, newdata = data_test)
# postResample(pred = prediction, obs = data_test$local_res_atom_non_h_count)
Liczba elektronów
# data_partition <- data %>%
# select_if(is.numeric)
#
# set.seed(111)
# partition <- createDataPartition(
# y = data_partition$local_res_atom_non_h_electron_sum,
# p = .7,
# list = FALSE)
#
# data_train <- data_partition %>%
# slice(partition)
# data_test <- data_partition %>%
# slice(-partition)
# dim(data_train)
# dim(data_test)
#
# set.seed(111)
# fit <- train(local_res_atom_non_h_electron_sum ~ ., data = data_train, method = "lm")
# fit
#
# set.seed(111)
# prediction <- predict(fit, newdata = data_test)
# postResample(pred = prediction, obs = data_test$local_res_atom_non_h_electron_sum)
Klasyfikator
Przewidywanie wartości res_name
# Dimensions of the filtered data set before the classifier input is built.
dim(data)
## [1] 6239 350
# Columns excluded from the classifier input (coverage, atom-count and
# electron-sum columns).
removable_columns <- c(
  "blob_coverage",
  "res_coverage",
  "local_res_atom_non_h_count",
  "local_res_atom_non_h_electron_sum",
  "dict_atom_non_h_count",
  "dict_atom_non_h_electron_sum"
)
# all_of() states explicitly that `removable_columns` is an external character
# vector of column names. A bare vector inside select() is ambiguous (it would
# be read as a column if one with that name existed) and is deprecated by
# tidyselect. all_of() also raises an error if a listed column is missing.
data_partition <- data %>%
  select(-all_of(removable_columns))
dim(data_partition)
## [1] 6239 344
# The class label has to be a factor for the classification model.
data_partition[["res_name"]] <- as.factor(data_partition[["res_name"]])

# Row indices of the training part: 70% of the rows, sampled by
# createDataPartition() with respect to res_name. The seed is fixed so the
# split can be reproduced.
set.seed(111)
partition <- createDataPartition(
  y = data_partition[["res_name"]],
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set, the remaining rows the test set.
data_train <- slice(data_partition, partition)
data_test <- slice(data_partition, -partition)
dim(data_train)
## [1] 4391 344
dim(data_test)
## [1] 1848 344
# Train a random forest (caret method "rf") that predicts res_name from all
# remaining columns. ntree = 10 is forwarded to the underlying model and
# na.pass hands rows to the model without NA filtering.
set.seed(111)
fit <- train(
  res_name ~ .,
  method = "rf",
  ntree = 10,
  na.action = na.pass,
  data = data_train
)
fit
## Random Forest
##
## 4391 samples
## 343 predictor
## 50 classes: 'ACT', 'ACY', 'ADP', 'AMP', 'BR', 'C8E', 'CA', 'CD', 'CIT', 'CL', 'CME', 'CU', 'CYC', 'DMS', 'EDO', 'EPE', 'FAD', 'FE', 'FE2', 'FEC', 'FES', 'FMN', 'FMT', 'GLC', 'GOL', 'H4B', 'HEC', 'HEM', 'IOD', 'K', 'LHG', 'MAN', 'MES', 'MG', 'MLY', 'MN', 'MPD', 'NAD', 'NAG', 'NAP', 'NDP', 'NI', 'PEG', 'PG4', 'PGE', 'PLC', 'PLP', 'PO4', 'SO4', 'ZN'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 4391, 4391, 4391, 4391, 4391, 4391, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.3910456 0.3411316
## 172 0.5329791 0.4972496
## 343 0.5309613 0.4952548
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 172.
# Predict res_name for the held-out rows and compare the predictions with the
# true labels. The seed is reset first, presumably to keep the prediction
# step reproducible.
set.seed(111)
prediction <- predict(fit, newdata = data_test)
confusionMatrix(
  data = prediction,
  reference = data_test$res_name
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ACT ACY ADP AMP BR C8E CA CD CIT CL CME CU CYC DMS EDO EPE
## ACT 6 0 0 0 0 0 0 0 0 2 1 0 0 0 5 0
## ACY 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## ADP 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## AMP 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## BR 0 0 0 0 4 0 1 0 0 0 0 0 0 0 0 0
## C8E 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0
## CA 0 0 0 0 0 0 59 2 0 3 0 1 0 0 0 0
## CD 0 0 0 0 0 0 1 10 0 0 0 0 0 0 0 0
## CIT 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0
## CL 0 0 0 0 4 0 7 2 0 89 0 0 0 0 0 0
## CME 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0
## CU 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0
## CYC 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0
## DMS 1 0 0 0 0 0 0 0 0 0 0 0 0 72 4 1
## EDO 8 1 0 0 0 1 1 0 2 2 1 1 1 5 76 1
## EPE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4
## FAD 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FE 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## FE2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FEC 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FES 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## FMN 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
## FMT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GLC 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## GOL 9 3 2 3 0 2 0 0 1 0 0 0 0 3 28 1
## H4B 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## HEC 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## HEM 0 0 1 0 0 1 0 0 0 0 0 0 0 0 2 0
## IOD 0 0 0 0 0 0 3 0 0 2 0 0 0 0 0 0
## K 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
## LHG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## MAN 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## MES 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## MG 0 0 0 0 1 0 8 2 0 11 0 0 0 0 1 0
## MLY 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## MN 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## MPD 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## NAD 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## NAG 1 0 1 1 0 0 0 0 0 0 1 0 1 0 2 0
## NAP 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## NDP 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## NI 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## PEG 0 0 0 0 0 0 0 0 0 0 1 0 0 0 3 0
## PG4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## PGE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## PLC 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## PLP 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 2
## PO4 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
## SO4 8 3 2 5 0 1 0 1 0 0 0 0 1 18 14 3
## ZN 0 0 0 0 0 0 2 7 0 3 0 4 0 0 0 0
## Reference
## Prediction FAD FE FE2 FEC FES FMN FMT GLC GOL H4B HEC HEM IOD K LHG MAN
## ACT 0 0 0 0 0 0 2 0 5 0 0 3 0 0 0 0
## ACY 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## ADP 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## AMP 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## BR 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## C8E 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## CA 0 0 0 0 0 0 0 0 0 0 0 0 1 12 0 0
## CD 0 0 1 0 1 0 0 0 0 0 0 0 0 2 0 0
## CIT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## CL 0 0 0 0 0 0 0 0 0 0 0 0 5 3 0 0
## CME 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## CU 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## CYC 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## DMS 0 0 0 0 0 0 1 0 2 0 0 0 0 0 0 0
## EDO 2 0 0 1 0 0 3 0 26 0 1 7 1 0 0 0
## EPE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FAD 14 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## FE 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FE2 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0
## FEC 0 0 0 4 0 0 0 0 1 0 0 0 0 0 0 0
## FES 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0
## FMN 1 0 0 0 0 11 0 0 0 0 0 0 0 0 0 0
## FMT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GLC 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GOL 2 0 0 1 0 2 3 3 101 0 0 6 0 0 3 4
## H4B 1 0 0 0 0 1 0 0 0 5 0 0 0 0 0 0
## HEC 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0
## HEM 2 0 0 1 0 0 0 0 1 0 4 42 0 0 0 1
## IOD 0 0 0 0 0 0 0 0 0 0 0 0 18 0 0 0
## K 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0
## LHG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 0
## MAN 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3
## MES 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## MG 0 1 0 0 0 0 0 0 0 0 0 0 2 1 0 0
## MLY 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0
## MN 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## MPD 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## NAD 1 0 0 0 0 0 0 0 1 1 0 2 0 0 0 0
## NAG 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1
## NAP 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## NDP 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## NI 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## PEG 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0
## PG4 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## PGE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## PLC 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## PLP 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0
## PO4 0 0 0 0 1 1 0 0 3 0 0 0 0 0 0 0
## SO4 4 0 0 0 0 1 0 1 27 0 0 7 1 0 0 0
## ZN 0 3 3 0 1 0 0 0 0 0 0 0 0 0 0 0
## Reference
## Prediction MES MG MLY MN MPD NAD NAG NAP NDP NI PEG PG4 PGE PLC PLP PO4
## ACT 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## ACY 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## ADP 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## AMP 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## BR 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## C8E 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## CA 0 14 0 10 0 0 0 0 0 0 0 0 0 0 0 0
## CD 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 0
## CIT 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## CL 0 7 0 0 0 0 0 0 0 1 1 0 0 0 0 0
## CME 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## CU 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## CYC 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## DMS 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## EDO 0 2 1 0 1 0 3 0 0 0 7 1 2 0 0 1
## EPE 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## FAD 0 0 0 0 0 3 0 0 0 0 0 0 0 0 1 0
## FE 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FE2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FEC 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FES 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FMN 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## FMT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## GLC 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## GOL 2 1 3 0 4 10 14 3 3 0 15 7 1 0 2 7
## H4B 0 0 0 0 0 0 2 0 1 0 0 0 0 0 0 0
## HEC 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## HEM 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 2
## IOD 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## K 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## LHG 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## MAN 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## MES 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## MG 0 28 0 2 0 1 0 0 0 1 0 0 0 0 0 0
## MLY 0 0 7 0 0 0 0 0 0 0 1 0 0 0 0 0
## MN 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## MPD 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
## NAD 0 0 0 0 1 17 3 0 3 0 0 0 0 0 1 0
## NAG 1 0 2 0 0 4 87 0 0 0 2 3 3 0 2 1
## NAP 0 0 0 0 0 0 0 9 2 0 0 0 0 0 0 0
## NDP 0 0 0 0 0 0 1 1 2 0 0 0 0 0 0 0
## NI 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## PEG 1 0 0 0 0 1 0 0 0 0 2 2 1 0 0 0
## PG4 0 0 0 0 0 0 1 0 0 0 2 3 0 0 0 0
## PGE 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## PLC 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0
## PLP 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1
## PO4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 10
## SO4 0 0 2 0 2 1 3 3 1 0 1 2 1 0 1 27
## ZN 0 1 0 1 0 0 0 0 0 2 0 0 0 0 0 0
## Reference
## Prediction SO4 ZN
## ACT 1 0
## ACY 0 0
## ADP 0 0
## AMP 0 0
## BR 0 0
## C8E 0 0
## CA 0 9
## CD 0 6
## CIT 0 0
## CL 0 2
## CME 1 0
## CU 0 0
## CYC 0 0
## DMS 3 0
## EDO 16 0
## EPE 0 0
## FAD 0 0
## FE 1 2
## FE2 0 1
## FEC 0 0
## FES 0 0
## FMN 0 0
## FMT 1 0
## GLC 0 0
## GOL 24 0
## H4B 0 0
## HEC 0 0
## HEM 1 0
## IOD 0 0
## K 1 1
## LHG 0 0
## MAN 1 0
## MES 0 0
## MG 0 6
## MLY 0 0
## MN 0 2
## MPD 0 0
## NAD 1 0
## NAG 1 0
## NAP 0 0
## NDP 0 0
## NI 0 1
## PEG 1 0
## PG4 0 0
## PGE 0 0
## PLC 0 0
## PLP 0 0
## PO4 11 0
## SO4 230 1
## ZN 0 64
##
## Overall Statistics
##
## Accuracy : 0.5703
## 95% CI : (0.5474, 0.5931)
## No Information Rate : 0.1591
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5361
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: ACT Class: ACY Class: ADP Class: AMP Class: BR
## Sensitivity 0.176471 0.363636 0.416667 0.000000 0.444444
## Specificity 0.988975 0.999456 0.999455 0.998910 0.999456
## Pos Pred Value 0.230769 0.800000 0.833333 0.000000 0.800000
## Neg Pred Value 0.984632 0.996202 0.996200 0.992958 0.997287
## Prevalence 0.018398 0.005952 0.006494 0.007035 0.004870
## Detection Rate 0.003247 0.002165 0.002706 0.000000 0.002165
## Detection Prevalence 0.014069 0.002706 0.003247 0.001082 0.002706
## Balanced Accuracy 0.582723 0.681546 0.708061 0.499455 0.721950
## Class: C8E Class: CA Class: CD Class: CIT Class: CL
## Sensitivity 0.250000 0.70238 0.384615 0.250000 0.78070
## Specificity 1.000000 0.97052 0.992316 0.998913 0.98155
## Pos Pred Value 1.000000 0.53153 0.416667 0.500000 0.73554
## Neg Pred Value 0.996750 0.98561 0.991228 0.996746 0.98552
## Prevalence 0.004329 0.04545 0.014069 0.004329 0.06169
## Detection Rate 0.001082 0.03193 0.005411 0.001082 0.04816
## Detection Prevalence 0.001082 0.06006 0.012987 0.002165 0.06548
## Balanced Accuracy 0.625000 0.83645 0.688466 0.624457 0.88112
## Class: CME Class: CU Class: CYC Class: DMS Class: EDO
## Sensitivity 0.500000 0.333333 0.571429 0.73469 0.54676
## Specificity 0.997826 1.000000 1.000000 0.99200 0.94207
## Pos Pred Value 0.500000 1.000000 1.000000 0.83721 0.43429
## Neg Pred Value 0.997826 0.996748 0.998373 0.98524 0.96234
## Prevalence 0.004329 0.004870 0.003788 0.05303 0.07522
## Detection Rate 0.002165 0.001623 0.002165 0.03896 0.04113
## Detection Prevalence 0.004329 0.001623 0.002165 0.04654 0.09470
## Balanced Accuracy 0.748913 0.666667 0.785714 0.86335 0.74442
## Class: EPE Class: FAD Class: FE Class: FE2 Class: FEC
## Sensitivity 0.333333 0.424242 0.555556 0.666667 0.571429
## Specificity 0.999455 0.997245 0.997281 0.999455 0.998914
## Pos Pred Value 0.800000 0.736842 0.500000 0.888889 0.666667
## Neg Pred Value 0.995659 0.989612 0.997824 0.997825 0.998371
## Prevalence 0.006494 0.017857 0.004870 0.006494 0.003788
## Detection Rate 0.002165 0.007576 0.002706 0.004329 0.002165
## Detection Prevalence 0.002706 0.010281 0.005411 0.004870 0.003247
## Balanced Accuracy 0.666394 0.710744 0.776418 0.833061 0.785171
## Class: FES Class: FMN Class: FMT Class: GLC
## Sensitivity 0.625000 0.687500 0.0000000 0.000000
## Specificity 0.999457 0.998362 0.9994562 0.997827
## Pos Pred Value 0.833333 0.785714 0.0000000 0.000000
## Neg Pred Value 0.998371 0.997274 0.9951272 0.996204
## Prevalence 0.004329 0.008658 0.0048701 0.003788
## Detection Rate 0.002706 0.005952 0.0000000 0.000000
## Detection Prevalence 0.003247 0.007576 0.0005411 0.002165
## Balanced Accuracy 0.812228 0.842931 0.4997281 0.498914
## Class: GOL Class: H4B Class: HEC Class: HEM
## Sensitivity 0.57386 0.625000 0.500000 0.60000
## Specificity 0.89713 0.997283 1.000000 0.98988
## Pos Pred Value 0.36996 0.500000 1.000000 0.70000
## Neg Pred Value 0.95238 0.998368 0.997287 0.98434
## Prevalence 0.09524 0.004329 0.005411 0.03788
## Detection Rate 0.05465 0.002706 0.002706 0.02273
## Detection Prevalence 0.14773 0.005411 0.002706 0.03247
## Balanced Accuracy 0.73550 0.811141 0.750000 0.79494
## Class: IOD Class: K Class: LHG Class: MAN Class: MES
## Sensitivity 0.64286 0.181818 0.800000 0.333333 0.285714
## Specificity 0.99670 0.996714 0.999454 0.999456 0.998914
## Pos Pred Value 0.75000 0.400000 0.923077 0.750000 0.500000
## Neg Pred Value 0.99452 0.990207 0.998365 0.996746 0.997289
## Prevalence 0.01515 0.011905 0.008117 0.004870 0.003788
## Detection Rate 0.00974 0.002165 0.006494 0.001623 0.001082
## Detection Prevalence 0.01299 0.005411 0.007035 0.002165 0.002165
## Balanced Accuracy 0.81978 0.589266 0.899727 0.666395 0.642314
## Class: MG Class: MLY Class: MN Class: MPD Class: NAD
## Sensitivity 0.48276 0.437500 0.0588235 0.000000 0.447368
## Specificity 0.97933 0.998362 0.9978154 0.997826 0.992265
## Pos Pred Value 0.43077 0.700000 0.2000000 0.000000 0.548387
## Neg Pred Value 0.98317 0.995103 0.9913185 0.995662 0.988442
## Prevalence 0.03139 0.008658 0.0091991 0.004329 0.020563
## Detection Rate 0.01515 0.003788 0.0005411 0.000000 0.009199
## Detection Prevalence 0.03517 0.005411 0.0027056 0.002165 0.016775
## Balanced Accuracy 0.73104 0.717931 0.5283195 0.498913 0.719817
## Class: NAG Class: NAP Class: NDP Class: NI Class: PEG
## Sensitivity 0.73729 0.562500 0.153846 0.1428571 0.064516
## Specificity 0.98150 0.998362 0.997820 0.9989136 0.993396
## Pos Pred Value 0.73109 0.750000 0.333333 0.3333333 0.142857
## Neg Pred Value 0.98207 0.996187 0.994028 0.9967480 0.984188
## Prevalence 0.06385 0.008658 0.007035 0.0037879 0.016775
## Detection Rate 0.04708 0.004870 0.001082 0.0005411 0.001082
## Detection Prevalence 0.06439 0.006494 0.003247 0.0016234 0.007576
## Balanced Accuracy 0.85940 0.780431 0.575833 0.5708854 0.528956
## Class: PG4 Class: PGE Class: PLC Class: PLP
## Sensitivity 0.142857 0.0000000 1.000000 0.166667
## Specificity 0.996169 0.9994565 1.000000 0.996732
## Pos Pred Value 0.300000 0.0000000 1.000000 0.250000
## Neg Pred Value 0.990207 0.9956687 1.000000 0.994565
## Prevalence 0.011364 0.0043290 0.004329 0.006494
## Detection Rate 0.001623 0.0000000 0.004329 0.001082
## Detection Prevalence 0.005411 0.0005411 0.004329 0.004329
## Balanced Accuracy 0.569513 0.4997283 1.000000 0.581699
## Class: PO4 Class: SO4 Class: ZN
## Sensitivity 0.200000 0.7823 0.67368
## Specificity 0.988877 0.9086 0.98460
## Pos Pred Value 0.333333 0.6183 0.70330
## Neg Pred Value 0.977998 0.9566 0.98236
## Prevalence 0.027056 0.1591 0.05141
## Detection Rate 0.005411 0.1245 0.03463
## Detection Prevalence 0.016234 0.2013 0.04924
## Balanced Accuracy 0.594438 0.8455 0.82914